Import Grid points

Import points which represent 1km grid centroids of a grid covering London

grid        <- read.csv('X:/Projects/LAEI2013_TfL_Strategy/z_GridsForJames.csv')
grid        <- grid[,c('GridIdEx', 'Easting', 'Northing')]

names(grid) <- c('grid_exact_cut', 'easting', 'northing')

This gives a table of 3355 rows, show below.

grid_exact_cut easting northing
32 517500 201500
33 518500 201500
34 519500 201500
35 520500 201500
36 521500 201500
37 543500 201500

Import Traffic flows

Now import traffic flows by toid and grid exact cut, and edit/clean to get what we need

traffic_flows                 <- read.csv('X:/Projects/LAEI2013_Gdrive/TrafficFlows/Major_WithAdjustedIbus_AADT/z_AADT_ExactCut_WithIbus_2013.csv')
names(traffic_flows)          <- tolower(names(traffic_flows))

drop                          <- c('year', 'location_exactcut', 'boroughname_exactcut', 'tlrn', 'irr', 'motorwaynumber', 'petrolcar', 'dieselcar', 'electriccar', 'petrollgv', 'diesellgv', 'electriclgv', 'ltbus', 'coach')
traffic_flows                 <- traffic_flows[, !(names(traffic_flows) %in% drop)]

traffic_flows$toid            <- as.character(traffic_flows$toid)

traffic_flows$total_vehicles  <- (traffic_flows$motorcycle + traffic_flows$taxi + traffic_flows$car + traffic_flows$busandcoach + traffic_flows$lgv + traffic_flows$rigid2axle + traffic_flows$rigid3axle + traffic_flows$rigid4axle +     traffic_flows$artic3axle + traffic_flows$artic5axle + traffic_flows$artic6axle)

traffic_flows$total_light     <- (traffic_flows$motorcycle + traffic_flows$taxi + traffic_flows$car)

traffic_flows$total_heavy     <- (traffic_flows$busandcoach + traffic_flows$lgv + traffic_flows$rigid2axle + traffic_flows$rigid3axle + traffic_flows$rigid4axle + traffic_flows$artic3axle + traffic_flows$artic5axle + traffic_flows$artic6axle)

drop                          <- c('motorcycle', 'taxi', 'car', 'busandcoach', 'lgv', 'rigid2axle', 'rigid3axle', 'rigid4axle', 'artic3axle', 'artic5axle', 'artic6axle')
traffic_flows                 <- traffic_flows[, !(names(traffic_flows) %in% drop)]

This gives a table of 87996 rows, show below.

toid grid_exactcut_id dotref id speed total_vehicles total_light total_heavy
4000000030455238 2739 26145 0 48.92023 28315.654 23554.666 4760.9877
4000000030455245 2738 8468 0 59.23806 85580.440 68722.520 16857.9195
4000000030455249 2739 8468 0 13.03565 85580.440 68722.520 16857.9195
4000000030455250 2738 8468 0 27.10584 85580.440 68722.520 16857.9195
4000000030455254 818 0 13633 13.94029 4395.482 3897.315 498.1674
4000000030455271 2160 999957 0 30.18082 15938.228 13188.405 2749.8227

Import TOID information

road_info                    <- read.csv('erg_roads_info.csv')

names(road_info)             <- tolower(names(road_info))

road_info                    <- road_info[,c('toid', 'desc_term')]

road_info$toid               <- as.character(road_info$toid)

Import Emissions

Import emissions by toid and grid exact cut

emissions                     <- read.csv('X:/Projects/LAEI2013_TfL_Strategy/FilesForHerald4/EmissionsByLink/INPUT_RESULTS_LAEI_ExactCutIntersect_Major_2013_LAEI_V117.csv')
names(emissions)              <- tolower(names(emissions))

drop                          <- c('emissions', 'year', 'petrolcar', 'dieselcar', 'petrollgv', 'diesellgv', 'ltbus', 'coach', 'electriccar', 'electriclgv')
emissions                     <- emissions[, !(names(emissions) %in% drop)]

emissions$total_emissions     <- emissions$motorcycle + emissions$taxi + emissions$car + emissions$busandcoach + emissions$lgv + emissions$rigid + emissions$artic + emissions$rigid2axle + emissions$rigid3axle + emissions$rigid4axle + emissions$artic3axle + emissions$artic5axle + emissions$artic6axle

emissions$toid                <- as.character(emissions$toid)

drop                          <- c('motorcycle', 'taxi', 'car', 'busandcoach', 'lgv', 'rigid', 'artic', 'rigid2axle', 'rigid3axle', 'rigid4axle', 'artic3axle', 'artic5axle', 'artic6axle')
emissions                     <- emissions[, !(names(emissions) %in% drop)]

emissions                     <- emissions[!(emissions$pollutant %in% c('TB_PM10_SW', 'TB_PM25_SW')),]

Not going to need all the constiuent parts of the emissions, just the total by each. So aggregate up.

emissions$pollutant           <- as.character(emissions$pollutant)

emissions[grep('pm25', tolower(emissions$pollutant)),'pollutant'] <- 'pm25'
emissions[grep('pm10', tolower(emissions$pollutant)),'pollutant'] <- 'pm10'

emissions$gridid            <- as.character(emissions$gridid)
emissions$grid_exactcut_id  <- as.character(emissions$grid_exactcut_id)
emissions$dotref            <- as.character(emissions$dotref)

emissions$pollutant         <- tolower(emissions$pollutant)

emissions <- aggregate(total_emissions ~ gridid + toid + grid_exactcut_id + dotref + length + pollutant, data=emissions, FUN=sum)

This gives a table of 205496 rows, show below.

gridid toid grid_exactcut_id dotref length pollutant total_emissions
6772 4000000030098250 106 16001 0 no2 0
6772 4000000030098253 106 16001 0 no2 0
6772 4000000030413089 106 16001 0 no2 0
6772 4000000030413092 106 16001 0 no2 0
6083 4000000027909026 12 16001 0 no2 0
6083 4000000027909034 12 16001 0 no2 0

Aggregate 1km grids to 10km

First sum emissions by the 1km, to 1km grid, and create raster stacks from them

ukgrid = "+init=epsg:27700"

one_km_emissions <- aggregate(data = all_data, total_emissions ~ pollutant + easting + northing, FUN=sum)

sources          <- as.character((unique(one_km_emissions$pollutant)))

for (i in 1:length(sources)) {
  
  temp              <- one_km_emissions[one_km_emissions$pollutant == sources[i],]
  coordinates(temp) <- ~ easting + northing
  temp$pollutant    <- NULL
  gridded(temp)     <- TRUE
  proj4string(temp) <- CRS(ukgrid)
  temp              <- raster(temp)
  names(temp)       <- tolower(sources[i])

  if (i == 1) { one_km_emissions_raster <- temp} else {one_km_emissions_raster <- stack(one_km_emissions_raster, temp)}
  
}

NO2 1km plot

Now make a 10km plot

ten_km_emissions_raster  <- aggregate(one_km_emissions_raster, fact = 10, fun=sum)

rm(one_km_emissions, one_km_emissions_raster, temp, i)

NO2 10km plot

Now need to think about how these concentrations at 10km level, relate to the road link that contributed them, and get the relationship between them. First convert the raster to polygons.

ten_km_emissions_polygons <- rasterToPolygons(ten_km_emissions_raster)

NO2 10km polygons plot

Calculate each TOIDS contribution

This code now looks at each road, sees which overall 10km grid it is in, and then divides each pollutant by the total of that pollutant in the square. Puts the result into ‘ten_km_contribution’ variable.

result                                                  <- point.in.poly(all_data, ten_km_emissions_polygons)

result                                                  <- data.frame(result)

result$ten_km_contribution                              <- NA

result[result$pollutant == 'no2',]$ten_km_contribution    <- result[result$pollutant == 'no2',]$total_emissions    / result[result$pollutant == 'no2',]$no2
result[result$pollutant == 'nox',]$ten_km_contribution    <- result[result$pollutant == 'nox',]$total_emissions    / result[result$pollutant == 'nox',]$nox
result[result$pollutant == 'pm10',]$ten_km_contribution   <- result[result$pollutant == 'pm10',]$total_emissions   / result[result$pollutant == 'pm10',]$pm10
result[result$pollutant == 'pm25',]$ten_km_contribution   <- result[result$pollutant == 'pm25',]$total_emissions   / result[result$pollutant == 'pm25',]$pm25

rm(all_data)

Some toid exact cuts (9616) have zero emissions, remove those from the total of 205496.

result <- result[result$total_emissions > 0,]

Now before create a model to calculate a regression between the variables and the emissions, aggregate from TOID exact cuts, up to DotRef. This means that we go from 48970 ‘streets’ …..

result <- merge(aggregate(data = result, cbind(length, total_emissions, ten_km_contribution) ~ dotref + desc_term + pollutant, sum),
          aggregate(data = result, cbind(speed, total_vehicles, total_light, total_heavy) ~ dotref + desc_term + pollutant, mean),
          by = c('dotref', 'pollutant'))

#test <- merge(aggregate(data = result, cbind(length, total_emissions, ten_km_contribution) ~ dotref + pollutant, sum),
#          aggregate(data = result, cbind(speed, total_vehicles, total_light, total_heavy) ~ dotref + pollutant, mean),
#          by = c('dotref', 'pollutant'))

… down to 2990.

Exploring data relationships

Histogram of 10km emission contributions

ggplot(result, aes(ten_km_contribution)) +
  geom_histogram(bins = 60, aes(fill=pollutant)) +
  facet_wrap(.~pollutant, scales = 'free') +
  xlab('Contribution to 10km emissions')

Total vehicles v. emission contribution

Light vehicles v. emission contribution

Heavy vehicles v. emission contribution

Speed v. emission contribution

Length v. emission contribution

Create training and testing data

Take 80% of the data as a training data set

training_index <- sample(nrow(result), round(nrow(result) * 0.8,0))

training_data <- result[training_index,]

Seperate the other 20% as a testing data set

testing_data <- result[-training_index,]

Make linear regression model

Make a liner regression model using the training data

model_results <- data.frame(pollutant = NA,
                            intercept = as.numeric(''),
                            length_coef = as.numeric(''),
                            speed_coef = as.numeric(''),
                            total_light_coef = as.numeric(''),
                            total_heavy_coef = as.numeric(''),
                            stringsAsFactors = F)

model_results_list <- list()

for (i in 1:length(sources)) {
  
  model <- lm(ten_km_contribution ~ length + speed + total_light + total_heavy,
             data=training_data[training_data$pollutant == sources[i],])

  model_results_list[[i]] <- model
  
  model_results[i,] <- c(sources[i], as.numeric(coef(model)[1]), as.numeric(coef(model)[2]), as.numeric(coef(model)[3]), as.numeric(coef(model)[4]), as.numeric(coef(model)[5]))
  
  rm(model)
}

model_results$intercept         <- as.numeric(model_results$intercept)
model_results$length_coef       <- as.numeric(model_results$length_coef)
model_results$speed_coef        <- as.numeric(model_results$speed_coef)
model_results$total_light_coef  <- as.numeric(model_results$total_light_coef)
model_results$total_heavy_coef  <- as.numeric(model_results$total_heavy_coef)

Model result

Apply linear regression model

Now use the regression equation on the testing 20% data

testing_data$prediction <- NA

for (i in 1:length(sources)) {
  testing_data[testing_data$pollutant == sources[i],'prediction'] <-    model_results[model_results$pollutant == sources[i],]$intercept+(
                                                                  (testing_data[testing_data$pollutant == sources[i],]$length      * model_results[model_results$pollutant == sources[i],]$length_coef) +
                                                                  (testing_data[testing_data$pollutant == sources[i],]$speed       * model_results[model_results$pollutant == sources[i],]$speed_coef) +
                                                                  (testing_data[testing_data$pollutant == sources[i],]$total_light * model_results[model_results$pollutant == sources[i],]$total_light_coef) +
                                                                  (testing_data[testing_data$pollutant == sources[i],]$total_heavy * model_results[model_results$pollutant == sources[i],]$total_heavy_coef))
}

Results

Plot the prediction against the actuals for the testing 20%